2BR4_rna_brainspan_enrichment

2/16/2021 can use smartseq make a gene tissue gmt

In [2]:
library(tidyverse)
# library(readr)
library(ggplot2)
library(clusterProfiler)
library(org.Hs.eg.db)
library(pheatmap)
library(caret)  
library(ReactomePA)
library(annotate)
library(seqinr)
In [3]:
save_path = '/Users/mguo123/Google Drive/1_khavari/noncancer_project/miseq/novogene_071420/scrna_enrichment/'
In [4]:
path_genes = '/Users/mguo123/Google Drive/1_khavari/noncancer_project/miseq/novogene_071420/D_mpraanalyze_barcode_allelic//egene_gtex/'
diseases = sort(c('schizo','depress_updatedhoward','bipolar','anxiety','attent','personality','panic','traum','autism','ocd'))
diseases_all = c(diseases, 'all')
get_genes = function(dz){
    file = str_c(path_genes,dz,'.txt')
    genes = read.table(file,header=F,stringsAsFactor=F)$V1
    print(dz)
    print(length(genes))
    return(genes)
}

get_entrez = function(genes){
    entrez_ids = bitr(genes, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db")$ENTREZID
#     print(length(entrez_ids))
    return(entrez_ids)
#     return(bitr(genes, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db"))
}

enrichr_res = function(entrez_genes, c_gmt){
    enrichr_res <- enricher(entrez_genes, TERM2GENE=c_gmt,                                            
                                            minGSSize=0,
                                            maxGSSize=1000,                                                
                                            pAdjustMethod = "fdr",
                                            qvalueCutoff = 1,
                                            pvalueCutoff = 1)
    
    if (!is.null(enrichr_res)){
        enrichr_res <- setReadable(enrichr_res, OrgDb=org.Hs.eg.db, keyType="ENTREZID")
        return(data.frame(enrichr_res))}
    return(data.frame())
}

filt_res = function(df){
    return(filter(df,p.adjust<0.05))
}

get_lengths = function(list_dfs){
    for (name in names(list_dfs)){
        print(name)
        if (is.data.frame(list_dfs[[name]])){
            print(dim(list_dfs[[name]])[1])
        }
        else{
            print(length(list_dfs[[name]]))
        }
    }
}

save_dfs = function(list_dfs, save_prefix){
    for (name in names(list_dfs)){
        if (dim(list_dfs[[name]])[1]>0){
            save_filepath = paste0(save_prefix, '_', name,'.csv')
            write.csv(list_dfs[[name]], save_filepath)            
        }

    }
}

                            
# get list of unique genes in list of lists
get_genes_unique = function(list_dfs){
    genelist = sort(unique(do.call('c',lapply(do.call('rbind',list_dfs)$geneID, function(x) strsplit(x,'/')[[1]]))))
    return(genelist)
}
run_enrichment = function(entrez_list, c_gmt, save_prefix){
    enrich_df_list =  sapply(entrez_list, function(x) enrichr_res(x,c_gmt))
    save_dfs(enrich_df_list, save_prefix)
    enrich_df_list_filt = sapply(enrich_df_list, filt_res)
    get_lengths(enrich_df_list_filt)
    print('getting unique genes --all')
    print(get_genes_unique (enrich_df_list)                        )
    print('getting unique genes --pval filt')
    print(get_genes_unique (enrich_df_list_filt)   )
    return(enrich_df_list_filt)
}
 
# rna_df is samples x genes and removed low variances genes
remove_lowvar_genes = function(rna_df){
    nzv_cols <- nearZeroVar(rna_df)
    print(dim(rna_df))
    if(length(nzv_cols) > 0) rna_df <- rna_df[, -nzv_cols]
    print(dim(rna_df))
    return(rna_df)

}

save_pheatmap_png <- function(x, filename, width=1200, height=1000, res = 200) {
  png(filename, width = width, height = height, res = res)
  grid::grid.newpage()
  grid::grid.draw(x$gtable)
  dev.off()
}

save_pheatmap_pdf <- function(x, filename, width=7, height=7) {
   stopifnot(!missing(x))
   stopifnot(!missing(filename))
   pdf(filename, width=width, height=height)
   grid::grid.newpage()
   grid::grid.draw(x$gtable)
   dev.off()
}



# get_expr
In [5]:
file = '/Users/mguo123/Google Drive/1_khavari/noncancer_project/miseq/novogene_071420/D_mpraanalyze_barcode_allelic/interesting_egenes.csv'
interesting_egenes = read.csv(file,header=F,stringsAsFactor=F)$V1
# interesting_egenes
In [6]:
file = '/Users/mguo123/Google Drive/1_khavari/noncancer_project/miseq/novogene_071420/D_mpraanalyze_barcode_allelic/interesting_egenes.csv'
interesting_egenes = read.csv(file,header=F,stringsAsFactor=F)$V1
# interesting_egenes
In [7]:
dz_genes = sapply(diseases_all, get_genes)
[1] "anxiety"
[1] 18
[1] "attent"
[1] 46
[1] "autism"
[1] 1
[1] "bipolar"
[1] 129
[1] "depress_updatedhoward"
[1] 101
[1] "ocd"
[1] 20
[1] "panic"
[1] 4
[1] "personality"
[1] 12
[1] "schizo"
[1] 308
[1] "traum"
[1] 5
[1] "all"
[1] 429
In [8]:
dz_gene_list_entrez = sapply(dz_genes,get_entrez)
get_lengths(dz_gene_list_entrez)
'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

Warning message in bitr(genes, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = "org.Hs.eg.db"):
“4.35% of input gene IDs are fail to map...”
'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

Warning message in bitr(genes, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = "org.Hs.eg.db"):
“3.88% of input gene IDs are fail to map...”
'select()' returned 1:1 mapping between keys and columns

Warning message in bitr(genes, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = "org.Hs.eg.db"):
“0.99% of input gene IDs are fail to map...”
'select()' returned 1:1 mapping between keys and columns

Warning message in bitr(genes, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = "org.Hs.eg.db"):
“5% of input gene IDs are fail to map...”
'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

Warning message in bitr(genes, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = "org.Hs.eg.db"):
“3.9% of input gene IDs are fail to map...”
'select()' returned 1:1 mapping between keys and columns

'select()' returned 1:1 mapping between keys and columns

Warning message in bitr(genes, fromType = "SYMBOL", toType = "ENTREZID", OrgDb = "org.Hs.eg.db"):
“3.26% of input gene IDs are fail to map...”
[1] "anxiety"
[1] 18
[1] "attent"
[1] 44
[1] "autism"
[1] 1
[1] "bipolar"
[1] 124
[1] "depress_updatedhoward"
[1] 100
[1] "ocd"
[1] 19
[1] "panic"
[1] 4
[1] "personality"
[1] 12
[1] "schizo"
[1] 296
[1] "traum"
[1] 5
[1] "all"
[1] 415
In [10]:
# get entrez
length(dz_genes$all)
all_sym_to_entrez = bitr(dz_genes$all, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db")#$ENTREZID
str(all_sym_to_entrez)
429
'select()' returned 1:1 mapping between keys and columns

Warning message in bitr(dz_genes$all, fromType = "SYMBOL", toType = "ENTREZID", :
“3.26% of input gene IDs are fail to map...”
'data.frame':	415 obs. of  2 variables:
 $ SYMBOL  : chr  "AADAT" "ABCB9" "ABCC8" "ABHD16A" ...
 $ ENTREZID: chr  "51166" "23457" "6833" "7920" ...

create geneset (gmt dataframe) with geneset - gene long version genes in entrez

In [11]:
metadata_df = read.csv('../data/external/brainmap_cortex_smartseq/metadata.csv')
head(metadata_df)
A data.frame: 6 × 41
sample_nameexp_component_namespecimen_typecluster_colorcluster_ordercluster_labelclass_colorclass_orderclass_labelsubclass_colorcell_type_alt_alias_ordercell_type_alt_alias_labelcell_type_designation_colorcell_type_designation_ordercell_type_designation_labelexternal_donor_name_colorexternal_donor_name_orderexternal_donor_name_labeloutlier_calloutlier_type
<fct><fct><fct><fct><int><fct><fct><int><fct><fct><int><fct><fct><int><fct><fct><int><fct><fct><fct>
1F2S4_160113_027_A01LS-15005h_S01_E1-50nucleus NA NA NA NA #3DCC3D2H200.1025True Outlier L1-3 SST OR2AD1P
2F2S4_160113_027_B01LS-15005h_S02_E1-50nucleus#E170FE32Inh L2-5 VIP TOX2 #0066FF 4GABAergic#99651732#E170FE32Neuron 032#3DCC3D2H200.1025False
3F2S4_160113_027_C01LS-15005h_S03_E1-50nucleus#8E5864 2Inh L1 LAMP5 GGT8P #0066FF 4GABAergic#FF7373 2#8E5864 2Neuron 002#3DCC3D2H200.1025False
4F2S4_160113_027_D01LS-15005h_S04_E1-50nucleus#8B5862 1Inh L1 LAMP5 NDNF #0066FF 4GABAergic#FF7373 1#8B5862 1Neuron 001#3DCC3D2H200.1025False
5F2S4_160113_027_E01LS-15005h_S05_E1-50nucleus#CF6EC934Inh L1-3 VIP ZNF322P1#0066FF 4GABAergic#99651734#CF6EC934Neuron 034#3DCC3D2H200.1025False
6F2S4_160113_027_F01LS-15005h_S06_E1-50nucleus#E693CE35Inh L3 VIP CBLN1 #0066FF 4GABAergic#99651735#E693CE35Neuron 035#3DCC3D2H200.1025False
In [12]:
## efforts to try to annotate with cortical layer and region label, might need another heatmap?? (toDO later)

metadata_df%>%
    dplyr::select(cluster_label, class_label,subclass_label,region_label,cortical_layer_label)%>%
    mutate(cluster_id = str_replace_all(cluster_label, "[ .-]", "."))%>%
    filter(cluster_label!='')%>%
    distinct()%>%
    group_by(cluster_id, class_label, region_label)%>%#,cortical_layer_label)%>%
    tally()
A grouped_df: 834 × 4
cluster_idclass_labelregion_labeln
<chr><fct><fct><int>
Astro.L1.6.FGFR3.ETNPPLNon-neuronalA1C 8
Astro.L1.6.FGFR3.ETNPPLNon-neuronalCgG 5
Astro.L1.6.FGFR3.ETNPPLNon-neuronalM1lm5
Astro.L1.6.FGFR3.ETNPPLNon-neuronalM1ul5
Astro.L1.6.FGFR3.ETNPPLNon-neuronalMTG 6
Astro.L1.6.FGFR3.ETNPPLNon-neuronalS1lm7
Astro.L1.6.FGFR3.ETNPPLNon-neuronalS1ul7
Astro.L1.6.FGFR3.ETNPPLNon-neuronalV1C 8
Astro.L1.FGFR3.FOS Non-neuronalA1C 3
Astro.L1.FGFR3.FOS Non-neuronalCgG 3
Astro.L1.FGFR3.FOS Non-neuronalM1lm2
Astro.L1.FGFR3.FOS Non-neuronalM1ul2
Astro.L1.FGFR3.FOS Non-neuronalMTG 4
Astro.L1.FGFR3.FOS Non-neuronalS1lm2
Astro.L1.FGFR3.FOS Non-neuronalS1ul2
Astro.L1.FGFR3.FOS Non-neuronalV1C 2
Astro.L1.FGFR3.MT1G Non-neuronalA1C 5
Astro.L1.FGFR3.MT1G Non-neuronalCgG 3
Astro.L1.FGFR3.MT1G Non-neuronalM1lm3
Astro.L1.FGFR3.MT1G Non-neuronalM1ul3
Astro.L1.FGFR3.MT1G Non-neuronalMTG 3
Astro.L1.FGFR3.MT1G Non-neuronalS1lm2
Astro.L1.FGFR3.MT1G Non-neuronalS1ul2
Astro.L1.FGFR3.MT1G Non-neuronalV1C 4
Endo.L2.5.CLDN5 Non-neuronalA1C 4
Endo.L2.5.CLDN5 Non-neuronalCgG 4
Endo.L2.5.CLDN5 Non-neuronalM1lm3
Endo.L2.5.CLDN5 Non-neuronalM1ul2
Endo.L2.5.CLDN5 Non-neuronalMTG 5
Endo.L2.5.CLDN5 Non-neuronalS1lm2
Oligo.L4.6.OPALINNon-neuronalA1C 8
Oligo.L4.6.OPALINNon-neuronalCgG 5
Oligo.L4.6.OPALINNon-neuronalM1lm4
Oligo.L4.6.OPALINNon-neuronalM1ul5
Oligo.L4.6.OPALINNon-neuronalMTG 6
Oligo.L4.6.OPALINNon-neuronalS1lm6
Oligo.L4.6.OPALINNon-neuronalS1ul7
Oligo.L4.6.OPALINNon-neuronalV1C 8
OPC.L1.6.MYT1 Non-neuronalA1C 8
OPC.L1.6.MYT1 Non-neuronalCgG 5
OPC.L1.6.MYT1 Non-neuronalM1lm5
OPC.L1.6.MYT1 Non-neuronalM1ul5
OPC.L1.6.MYT1 Non-neuronalMTG 6
OPC.L1.6.MYT1 Non-neuronalS1lm7
OPC.L1.6.MYT1 Non-neuronalS1ul7
OPC.L1.6.MYT1 Non-neuronalV1C 8
Peri.L1.6.MUSTN1 Non-neuronalA1C 2
Peri.L1.6.MUSTN1 Non-neuronalCgG 2
Peri.L1.6.MUSTN1 Non-neuronalM1lm2
Peri.L1.6.MUSTN1 Non-neuronalM1ul1
Peri.L1.6.MUSTN1 Non-neuronalMTG 6
Peri.L1.6.MUSTN1 Non-neuronalS1lm1
Peri.L1.6.MUSTN1 Non-neuronalS1ul3
Peri.L1.6.MUSTN1 Non-neuronalV1C 3
VLMC.L1.3.CYP1B1 Non-neuronalA1C 1
VLMC.L1.3.CYP1B1 Non-neuronalCgG 2
VLMC.L1.3.CYP1B1 Non-neuronalM1lm1
VLMC.L1.3.CYP1B1 Non-neuronalMTG 2
VLMC.L1.3.CYP1B1 Non-neuronalS1ul1
VLMC.L1.3.CYP1B1 Non-neuronalV1C 1
In [13]:
# brainmap_expr_df_region = read.csv('../data/processed/fig1/rna_10xm1/brainmap_expr_df_region.csv',row.names=1)
brainmap_expr_df_region = read.csv('../data/external/brainmap_cortex_smartseq/medians.csv',row.names=1)%>%dplyr::select(-X)
brainmap_expr_df_region = data.frame(t(brainmap_expr_df_region))
In [14]:
dim(brainmap_expr_df_region)
head(brainmap_expr_df_region[,1:5])
  1. 120
  2. 50281
A data.frame: 6 × 5
X3.8.1.2X3.8.1.3X3.8.1.4X3.8.1.5X5.HT3C2
<dbl><dbl><dbl><dbl><dbl>
Exc.L5.6.FEZF2.ANKRD20A100000
Exc.L5.6.THEMIS.TMEM23300000
Inh.L1.LAMP5.NDNF00000
Exc.L6.FEZF2.CPZ00000
Astro.L1.FGFR3.MT1G00000
Exc.L2.3.LINC00507.RPL9P1700000
In [15]:
# get entrez
length(dz_genes$all)
all_sym_to_entrez = bitr(dz_genes$all, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db")#$ENTREZID
str(all_sym_to_entrez)
429
'select()' returned 1:1 mapping between keys and columns

Warning message in bitr(dz_genes$all, fromType = "SYMBOL", toType = "ENTREZID", :
“3.26% of input gene IDs are fail to map...”
'data.frame':	415 obs. of  2 variables:
 $ SYMBOL  : chr  "AADAT" "ABCB9" "ABCC8" "ABHD16A" ...
 $ ENTREZID: chr  "51166" "23457" "6833" "7920" ...
In [16]:
# egenes expressed in a brain tissue
geneset = dz_genes$all#unique(c(dz_genes$all,interesting_egenes))
region_gene_df = brainmap_expr_df_region[,geneset[geneset%in%colnames(brainmap_expr_df_region)]]
dim(region_gene_df)
region_gene_df = region_gene_df[,colSums(region_gene_df)>0]
dim(region_gene_df)
294/416
  1. 120
  2. 416
  1. 120
  2. 294
0.706730769230769

70% of egenes are expressed in at least 1 brain cortex cell type (SMART seq 2018)

In [25]:
# select egenes from expr df
geneset = intersect(dz_genes$all, interesting_egenes)#unique(c(dz_genes$all,interesting_egenes))
region_gene_df = brainmap_expr_df_region[,geneset[geneset%in%colnames(brainmap_expr_df_region)]]
dim(region_gene_df)
# region_gene_df[region_gene_df<=4] = 0
region_gene_df = remove_lowvar_genes(region_gene_df)
region_gene_df_log = log2(t(region_gene_df)+1)
dim(region_gene_df_log)
p = pheatmap(t(region_gene_df_log))
save_pheatmap_pdf(p,paste0(save_path, 'region_gene_df_all.pdf'),width=11,height=18)
par(bg='white')
plot(p$tree_col)
dev.off()
  1. 120
  2. 90
[1] 120  90
[1] 120  48
  1. 48
  2. 120
pdf: 2
null device: 1
In [21]:
# select egenes from expr df schizo for 
geneset = intersect(dz_genes$schizo, interesting_egenes)#unique(c(dz_genes$schizo,interesting_egenes))
region_gene_df_dz = brainmap_expr_df_region[,geneset[geneset%in%colnames(brainmap_expr_df_region)]]
dim(region_gene_df_dz)
# region_gene_df_dz[region_gene_df_dz<=5] = 0
# region_gene_df_dz = remove_lowvar_genes(region_gene_df_dz)
region_gene_df_dz_log = log2(t(region_gene_df_dz)+1)
region_gene_df_dz_log = region_gene_df_dz_log[rowSums(region_gene_df_dz_log)>.1,]
# region_gene_df_dz_log_norm = as.data.frame(scale(region_gene_df_dz_log, center = TRUE, scale = TRUE))
dim(region_gene_df_dz_log)
p = pheatmap(region_gene_df_dz_log)
save_pheatmap_pdf(p,paste0(save_path, 'region_gene_df_schizo.pdf'),width=20,height=10)
par(bg='white')
plot(p$tree_col)
dev.off()
  1. 120
  2. 66
  1. 42
  2. 120
pdf: 2
null device: 1
In [22]:
# select egenes from expr df schizo for 
geneset = intersect(dz_genes$bipolar, interesting_egenes)#unique(c(dz_genes$schizo,interesting_egenes))
region_gene_df_dz = brainmap_expr_df_region[,geneset[geneset%in%colnames(brainmap_expr_df_region)]]
dim(region_gene_df_dz)
# region_gene_df_dz[region_gene_df_dz<=5] = 0
# region_gene_df_dz = remove_lowvar_genes(region_gene_df_dz)
region_gene_df_dz_log = log2(t(region_gene_df_dz)+1)
region_gene_df_dz_log = region_gene_df_dz_log[rowSums(region_gene_df_dz_log)>.1,]
# region_gene_df_dz_log_norm = as.data.frame(scale(region_gene_df_dz_log, center = TRUE, scale = TRUE))
dim(region_gene_df_dz_log)
p = pheatmap(region_gene_df_dz_log)
save_pheatmap_pdf(p,paste0(save_path, 'region_gene_df_bipolar.pdf'),width=20,height=5)
par(bg='white')
plot(p$tree_col)
dev.off()
  1. 120
  2. 28
  1. 17
  2. 120
pdf: 2
null device: 1
In [23]:
# select egenes from expr df schizo for 
geneset = intersect(dz_genes$depress_updatedhoward, interesting_egenes)#unique(c(dz_genes$schizo,interesting_egenes))
region_gene_df_dz = brainmap_expr_df_region[,geneset[geneset%in%colnames(brainmap_expr_df_region)]]
dim(region_gene_df_dz)
# region_gene_df_dz[region_gene_df_dz<=5] = 0
# region_gene_df_dz = remove_lowvar_genes(region_gene_df_dz)
region_gene_df_dz_log = log2(t(region_gene_df_dz)+1)
region_gene_df_dz_log = region_gene_df_dz_log[rowSums(region_gene_df_dz_log)>.1,]
# region_gene_df_dz_log_norm = as.data.frame(scale(region_gene_df_dz_log, center = TRUE, scale = TRUE))
dim(region_gene_df_dz_log)
p = pheatmap(region_gene_df_dz_log)
save_pheatmap_pdf(p,paste0(save_path, 'region_gene_df_depress.pdf'),width=20,height=5)
par(bg='white')
plot(p$tree_col)
dev.off()
  1. 120
  2. 21
  1. 11
  2. 120
pdf: 2
null device: 1
In [25]:
# select egenes from expr df schizo for 
bpd_geneset = intersect(dz_genes$bipolar, interesting_egenes)#unique(c(dz_genes$schizo,interesting_egenes))
mdd_geneset = intersect(dz_genes$depress_updatedhoward, interesting_egenes)#unique(c(dz_genes$schizo,interesting_egenes))
geneset = union(bpd_geneset, mdd_geneset)
region_gene_df_dz = brainmap_expr_df_region[,geneset[geneset%in%colnames(brainmap_expr_df_region)]]
dim(region_gene_df_dz)
# region_gene_df_dz[region_gene_df_dz<=5] = 0
# region_gene_df_dz = remove_lowvar_genes(region_gene_df_dz)
region_gene_df_dz_log = log2(t(region_gene_df_dz)+1)
region_gene_df_dz_log = region_gene_df_dz_log[rowSums(region_gene_df_dz_log)>.1,]
# region_gene_df_dz_log_norm = as.data.frame(scale(region_gene_df_dz_log, center = TRUE, scale = TRUE))
dim(region_gene_df_dz_log)
p = pheatmap(region_gene_df_dz_log)
save_pheatmap_pdf(p,paste0(save_path, 'region_gene_df_bipolardepress.pdf'),width=20,height=5)
par(bg='white')
plot(p$tree_col)
dev.off()
  1. 120
  2. 41
  1. 26
  2. 120
pdf: 2
null device: 1
In [141]:
# select egenes from expr df interesting for 
geneset = interesting_egenes
region_gene_df_dz = brainmap_expr_df_region[,geneset[geneset%in%colnames(brainmap_expr_df_region)]]
dim(region_gene_df_dz)
region_gene_df_dz[region_gene_df_dz<=1] = 0
region_gene_df_dz = remove_lowvar_genes(region_gene_df_dz)
region_gene_df_dz_log = log2(t(region_gene_df_dz)+1)
# region_gene_df_dz_log_norm = as.data.frame(scale(region_gene_df_dz_log, center = TRUE, scale = TRUE))
p = pheatmap(region_gene_df_dz_log)
save_pheatmap_pdf(p,paste0(save_path, 'region_gene_df_vignette.pdf'),width=20,height=15)
par(bg='white')
plot(p$tree_col)
dev.off()
  1. 120
  2. 103
[1] 120 103
[1] 120  55
pdf: 2
null device: 1
In [ ]:

analysis for bpd and scz today (supplemental of figure 3 eGene networks)

In [109]:
# select egenes from expr df schizo for 
bpd_geneset = intersect(dz_genes$bipolar, interesting_egenes)#unique(c(dz_genes$schizo,interesting_egenes))
scz_geneset = intersect(dz_genes$schizo, interesting_egenes)#unique(c(dz_genes$schizo,interesting_egenes))
geneset = union(bpd_geneset, scz_geneset)
region_gene_df_dz = brainmap_expr_df_region[,geneset[geneset%in%colnames(brainmap_expr_df_region)]]
dim(region_gene_df_dz)
# region_gene_df_dz[region_gene_df_dz<=5] = 0
# region_gene_df_dz = remove_lowvar_genes(region_gene_df_dz)
region_gene_df_dz_log = log2(t(region_gene_df_dz)+1)
region_gene_df_dz_log = region_gene_df_dz_log[rowSums(region_gene_df_dz_log)>.1,]
# region_gene_df_dz_log_norm = as.data.frame(scale(region_gene_df_dz_log, center = TRUE, scale = TRUE))
dim(region_gene_df_dz_log)
dz_annon = rbind(data.frame(gene = bpd_geneset, disease='BPD'), 
      data.frame(gene = scz_geneset, disease='SCZ'))
dz_annon = dz_annon = dz_annon%>%
    group_by(gene)%>%
    mutate(disease = as.character(paste0(disease, collapse = "|")))%>%
    distinct()%>%
    ungroup()%>%
    mutate(gene = as.character(gene))%>%
    filter(gene %in% rownames(region_gene_df_dz_log))%>%
    column_to_rownames('gene')
#     ungroup()%>%
dim(dz_annon)
cell_annon= metadata_df%>%
    dplyr::select(cluster_label, class_label)%>%
    mutate(cluster_label = str_replace_all(cluster_label, "[ .-]", "."))%>%
    distinct()%>%
    filter(class_label!='')%>%
    filter(cluster_label %in% colnames(region_gene_df_dz_log))%>%
    column_to_rownames('cluster_label')
dim(cell_annon)
p = pheatmap(region_gene_df_dz_log,
           annotation_row=dz_annon,
            annotation_col = cell_annon)
save_pheatmap_pdf(p,paste0(save_path, 'region_gene_df_bipolarschizo.pdf'),width=20,height=12)
par(bg='white')
plot(p$tree_col)
dev.off()
  1. 120
  2. 70
  1. 44
  2. 120
  1. 44
  2. 1
  1. 120
  2. 1
pdf: 2
pdf: 3
In [141]:
library(ggfortify)
library(cluster)

pca_res <- prcomp(region_gene_df_dz_log)
pca_res_df <- as.data.frame(pca_res$x)
pca_res_df = cbind(dz_annon,pca_res_df)
pca_res_df$gene = rownames(pca_res_df)
ggplot(pca_res_df, aes(x=PC1, y=PC2, label=gene,color=disease))+
    geom_point()+
geom_text(position=position_jitter(width=1,height=1))+
    theme_classic()+ggtitle('PCA BrainMap Cortical Neuron Type')
ggsave(paste0(save_path, 'pca_bipolarschizo.pdf'))
autoplot(pca_res, label=TRUE, label.size = 3,
         loadings = TRUE, loadings.label = TRUE, loadings.label.size  = 3)
Saving 7 x 7 in image

In [ ]:

In [68]:
mono_poly_genes = read.csv('/Users/mguo123/Google Drive/1_khavari/noncancer_project/miseq/novogene_071420/omim_enrichment/mono_poly_overlap_genelist.txt',
        stringsAsFactor=F, header=F)$V1
length(mono_poly_genes)
87
In [78]:
# select egenes and get expression matrix
geneset = mono_poly_genes
region_gene_df_dz = brainmap_expr_df_region[,geneset[geneset%in%colnames(brainmap_expr_df_region)]]
dim(region_gene_df_dz)
# region_gene_df_dz[region_gene_df_dz<=5] = 0
# region_gene_df_dz = remove_lowvar_genes(region_gene_df_dz)
region_gene_df_dz_log = log2(t(region_gene_df_dz)+1)
region_gene_df_dz_log = region_gene_df_dz_log[rowSums(region_gene_df_dz_log)>.1,]
# region_gene_df_dz_log_norm = as.data.frame(scale(region_gene_df_dz_log, center = TRUE, scale = TRUE))
print('post expression filter per gene')
dim(region_gene_df_dz_log)
dz_annon = read.csv('/Users/mguo123/Google Drive/1_khavari/noncancer_project/miseq/novogene_071420/D_mpraanalyze_barcode_allelic/egene_gtex/all_string_protein_annotations_w_dz.csv',
                     stringsAsFactor=F)
dz_annon = dz_annon%>%
    dplyr::select(X.node, diseases)%>%
    filter(X.node %in% rownames(region_gene_df_dz_log))%>%
    column_to_rownames('X.node')
print('dz annon dimensions')
dim(dz_annon)

cell_annon= metadata_df%>%
    dplyr::select(cluster_label, class_label)%>%
    mutate(cluster_label = str_replace_all(cluster_label, "[ .-]", "."))%>%
    distinct()%>%
    filter(class_label!='')%>%
    filter(cluster_label %in% colnames(region_gene_df_dz_log))%>%
    column_to_rownames('cluster_label')
print('cell type annon dimensions')
dim(cell_annon)
p = pheatmap(region_gene_df_dz_log,
           annotation_row=dz_annon,
            annotation_col = cell_annon)
save_pheatmap_pdf(p,paste0(save_path, 'region_gene_df_mono_poly.pdf'),width=20,height=12)
par(bg='white')
plot(p$tree_col)
dev.off()
  1. 120
  2. 84
[1] "post expression filter per gene"
  1. 69
  2. 120
[1] "dz annon dimensions"
  1. 69
  2. 1
[1] "cell type annon dimensions"
  1. 120
  2. 1
pdf: 2
pdf: 3
In [103]:
library(ggfortify)
library(cluster)

pca_res <- prcomp(region_gene_df_dz_log)
pca_res_df <- as.data.frame(pca_res$x)
# autoplot(pca_res, label=TRUE, label.size = 3)#,
#          loadings = TRUE, loadings.label = TRUE, loadings.label.size  = 3)
In [108]:
dz_annon
A data.frame: 69 × 1
diseases
<chr>
ABCC8BPD|SCZ
ACADMADHD
ACO2MDD
AGTADHD|BPD|SCZ
ANK3MDD|SCZ
AP3B2PTSD
APOPT1BPD|SCZ
ARL3ADHD|BPD|SCZ
ATXN7SCZ
BANF1SCZ
BOLA3PTSD
CDK10OCD
CEP85LADHD|BPD
CHKBSCZ
CHRNA2SCZ
CNNM2ADHD|BPD|SCZ
CNOT1BPD|SCZ
CYP2D6BPD|MDD|SCZ
DCCMDD
DDHD2SCZ
EP300MDD
FADS1BPD
FADS2BPD
FADS3BPD
FANCASCZ
FBXL4MDD
FGFR1GAD
FGFR3OCD
FIBPSCZ
GNAI2SCZ
MMABBPD|SCZ
MTHFD1MDD
MTHFRSCZ
MYRFBPD
NDUFA2SCZ
NDUFA6BPD|SCZ
NEK1SCZ
NFKB1SCZ
NT5C2SCZ
PARK7MDD
PCCBSCZ
PIGQADHD
PPP2R3CBPD|SCZ
PREPLGAD
PRMT7SCZ
REREMDD
RNASEH2CSCZ
RSRC1MDD
SDCCAG8SCZ
SLC30A9MDD
SPTLC1BPD|SCZ
SREBF1BPD|SCZ
SUFUBPD|SCZ
SZT2SCZ
TKTBPD|SCZ
TMX2SCZ
TPRKBSCZ
VARS2ADHD|BPD|SCZ
WASF1MDD
WDR73SCZ
In [104]:
cell_annon
A data.frame: 120 × 1
class_label
<fct>
Inh.L2.5.VIP.TOX2GABAergic
Inh.L1.LAMP5.GGT8PGABAergic
Inh.L1.LAMP5.NDNFGABAergic
Inh.L1.3.VIP.ZNF322P1GABAergic
Inh.L3.VIP.CBLN1GABAergic
Inh.L1.4.LAMP5.DUSP4GABAergic
Exc.L2.3.LINC00507.RPL9P17Glutamatergic
Inh.L1.SST.CXCL14GABAergic
Inh.L1.PAX6.GRIP2GABAergic
Inh.L1.2.VIP.PPAPDC1AGABAergic
Oligo.L4.6.OPALINNon-neuronal
Inh.L1.PAX6.CA4GABAergic
Inh.L1.ADARB2.ADAM33GABAergic
Inh.L1.4.VIP.CHRNA2GABAergic
Astro.L1.6.FGFR3.ETNPPLNon-neuronal
Inh.L2.6.VIP.VIPGABAergic
Inh.L1.6.LAMP5.CA13GABAergic
Exc.L5.6.THEMIS.GPR21Glutamatergic
Exc.L5.6.FEZF2.MYBPHLGlutamatergic
Exc.L4.5.RORB.RPL31P31Glutamatergic
Exc.L4.5.RORB.LCN15Glutamatergic
Inh.L4.6.SST.MTHFD2P6GABAergic
Exc.L6.THEMIS.LINC00343Glutamatergic
Exc.L6.FEZF2.FAM95CGlutamatergic
Exc.L4.5.RORB.LINC01474Glutamatergic
OPC.L1.6.MYT1Non-neuronal
Inh.L5.6.PVALB.FAM150BGABAergic
Exc.L6.FEZF2.KRT17Glutamatergic
Inh.L5.PVALB.CNTNAP3P2GABAergic
Inh.L5.6.LAMP5.SFTA3GABAergic
Exc.L2.4.RORB.GRIK1Glutamatergic
Inh.L1.6.VIP.PENKGABAergic
Inh.L1.2.PVALB.TAC1GABAergic
Inh.L1.3.PAX6.NABP1GABAergic
Inh.L1.3.VIP.CCDC184GABAergic
Inh.L1.6.VIP.RCN1GABAergic
Exc.L3.5.FEZF2.ONECUT1Glutamatergic
Exc.L6.FEZF2.TBCCGlutamatergic
Exc.L3.5.RORB.CD24Glutamatergic
VLMC.L1.3.CYP1B1Non-neuronal
Exc.L5.6.THEMIS.IL7RGlutamatergic
Exc.L4.RORB.BHLHE22Glutamatergic
Exc.L4.RORB.CACNG5Glutamatergic
Exc.L4.5.RORB.AIM2Glutamatergic
Exc.L4.6.RORB.HPCAGlutamatergic
Exc.L6.THEMIS.EGR3Glutamatergic
Exc.L6.FEZF2.VWA2Glutamatergic
Inh.L4.5.PVALB.TRIM67GABAergic
Exc.L4.5.RORB.ASCL1Glutamatergic
Exc.L5.6.FEZF2.RSAD2Glutamatergic
Exc.L3.LINC00507.PSRC1Glutamatergic
Exc.L3.4.RORB.RPS3P6Glutamatergic
Exc.L5.FEZF2.MORN2Glutamatergic
Exc.L3.5.LINC00507.SLNGlutamatergic
Exc.L3.5.THEMIS.UBE2FGlutamatergic
Exc.L3.5.FEZF2.DCNGlutamatergic
Exc.L4.RORB.CCDC168Glutamatergic
Exc.L3.LINC00507.CTXN3Glutamatergic
Exc.L3.THEMIS.PLA2G7Glutamatergic
Exc.L5.FEZF2.DYRK2Glutamatergic
In [90]:
# get expressed cell types
write.table(data.frame(sort(rownames(region_gene_df_dz_log))),
            '/Users/mguo123/Google Drive/1_khavari/noncancer_project/miseq/novogene_071420/omim_enrichment/mono_poly_overlap_genelist_scrna_filt.txt',
            quote=F, col.names=F, row.names=F)
In [ ]:

In [ ]:
1za
In [ ]:
# select egenes from expr df interesting for 
geneset = c('MTHFR', 'AKT1', 'C4A', 'CHRNA2')
region_gene_df_dz = brainmap_expr_df_region[,geneset[geneset%in%colnames(brainmap_expr_df_region)]]
dim(region_gene_df_dz)
region_gene_df_dz[region_gene_df_dz<=1] = 0
region_gene_df_dz = remove_lowvar_genes(region_gene_df_dz)
region_gene_df_dz_log = log2(t(region_gene_df_dz)+1)
# region_gene_df_dz_log_norm = as.data.frame(scale(region_gene_df_dz_log, center = TRUE, scale = TRUE))
p = pheatmap(region_gene_df_dz_log)
save_pheatmap_pdf(p,paste0(save_path, 'region_gene_df_omim.pdf'),width=20,height=15)
par(bg='white')
plot(p$tree_col)
dev.off()
In [89]:

In [ ]:

In [ ]:

In [ ]:

In [ ]:

In [ ]:

In [ ]:
# select egenes from expr df
region_gene_df_dz = brainmap_expr_df_region[,dz_genes$schizo[dz_genes$schizo%in%colnames(brainmap_expr_df_region)]]
dim(region_gene_df_dz)
region_gene_df_dz[region_gene_df_dz<=5] = 0
region_gene_df_dz_log = log10(t(remove_lowvar_genes(region_gene_df_dz))+1)
region_gene_df_dz_log_norm = as.data.frame(scale(region_gene_df_dz_log, center = TRUE, scale = TRUE))

pheatmap(region_gene_df_dz_log_norm)
In [73]:
# make gmt object
region_gene_df_long_filt = region_gene_df%>%
    rownames_to_column('region')%>%
    gather(gene,tpm, -region)%>%
    filter(tpm>5)%>%
    inner_join(all_sym_to_entrez, by=c('gene'='SYMBOL'))
# region_gene_df_long_annon = region_gene_df_long_filt%>%
#     column_to_rownames('gene')%>%
dim(region_gene_df_long_filt)
str(region_gene_df_long_filt)
  1. 721
  2. 4
'data.frame':	721 obs. of  4 variables:
 $ region  : chr  "A1C" "M1lm" "M1ul" "MTG" ...
 $ gene    : chr  "ABCB9" "ABCB9" "ABCB9" "ABCB9" ...
 $ tpm     : num  8 8 11 17 9 7 11 20 32 22 ...
 $ ENTREZID: chr  "23457" "23457" "23457" "23457" ...
In [74]:
region_gene_gmt = region_gene_df_long_filt%>%
    mutate(region = as.factor(region))%>%
    dplyr::select(region, ENTREZID)
str(region_gene_gmt)
'data.frame':	721 obs. of  2 variables:
 $ region  : Factor w/ 8 levels "A1C","CgG","M1lm",..: 1 3 4 5 6 7 8 1 2 3 ...
 $ ENTREZID: chr  "23457" "23457" "23457" "23457" ...

run enrichments

In [75]:
# compareCluster(geneCluster = dz_gene_list_entrez, fun = "enrichGO", 
#                                 pAdjustMethod='BH', 
#                                 pvalueCutoff  = 1,
#                                 qvalueCutoff  = 1,
#                                 OrgDb='org.Hs.eg.db', 
#                                 ont = "BP",
#                                 readable=TRUE)
In [76]:
dz_egene_scRNAregion = run_enrichment(entrez_list=dz_gene_list_entrez,
                          c_gmt = region_gene_gmt,
                          save_prefix=paste0(save_path, 'dz_egene_scRNAregion'))
--> No gene can be mapped....

--> Expected input gene ID: 9527,146691,84940,6733,54805,2185

--> return NULL...

--> No gene can be mapped....

--> Expected input gene ID: 23035,2023,54664,23248,11237,2185

--> return NULL...

[1] "anxiety"
[1] 0
[1] "attent"
[1] 0
[1] "autism"
[1] 0
[1] "bipolar"
[1] 0
[1] "depress"
[1] 0
[1] "ocd"
[1] 0
[1] "panic"
[1] 0
[1] "personality"
[1] 0
[1] "schizo"
[1] 0
[1] "traum"
[1] 0
[1] "all"
[1] 0
[1] "getting unique genes --all"
  [1] "ABCB9"    "ABCC8"    "ACADM"    "ALMS1"    "ANK3"     "AP3B2"   
  [7] "ARL3"     "ARL6IP4"  "ARNT"     "ASAP1"    "ATP6V1D"  "ATXN7"   
 [13] "BAG4"     "BOLA3"    "CAST"     "CCDC25"   "CD46"     "CDIP1"   
 [19] "CDK10"    "CEP170"   "CEP85L"   "CKAP5"    "CKB"      "CNNM2"   
 [25] "CNOT1"    "CORO6"    "DCC"      "DCP1A"    "DDHD2"    "DDX27"   
 [31] "ENO1"     "EP300"    "ERAP1"    "EVI5"     "FADS3"    "FBXL16"  
 [37] "FBXL20"   "FBXL4"    "FLOT1"    "FYN"      "GFM1"     "GLT8D1"  
 [43] "GOSR1"    "GPHN"     "HCG18"    "IK"       "IMMP2L"   "JKAMP"   
 [49] "KLC1"     "MADD"     "MAN2A1"   "MANBA"    "MPHOSPH9" "MPP6"    
 [55] "NAALAD2"  "NCR3LG1"  "NDRG4"    "NEGR1"    "NEK1"     "NEK4"    
 [61] "NFATC3"   "NFKB1"    "NSRP1"    "NT5C2"    "NUCB2"    "PARK7"   
 [67] "PBX1"     "PDCD11"   "PHLPP2"   "PIK3C2A"  "PITPNM2"  "PPP3R1"  
 [73] "PREPL"    "PRMT7"    "PTK2B"    "RAB40C"   "RANGAP1"  "RBM6"    
 [79] "RERE"     "RHOT2"    "RNF24"    "RPAP2"    "RPRD2"    "RPS18"   
 [85] "RSRC1"    "RTN1"     "SDCCAG8"  "SH3YL1"   "SLC30A9"  "SLC7A6"  
 [91] "SNX19"    "SNX32"    "SPTLC1"   "SRPK2"    "SSH2"     "ST13"    
 [97] "SULT4A1"  "SUZ12P1"  "SV2A"     "TAPBP"    "THOC7"    "TMEM106B"
[103] "TOM1L2"   "TTC7B"    "UBE2D3"   "VPS45"    "WASF1"    "YJEFN3"  
[109] "ZNF204P" 
[1] "getting unique genes --pval filt"
NULL

No enrichment so just make an annotation in the big table

make annotation table

In [79]:
region_gene_df_long_filt%>%
    group_by(gene)%>%tally()%>%arrange(desc(n))
A tibble: 109 × 2
genen
<chr><int>
ABCC8 8
ACADM 8
ALMS1 8
ANK3 8
AP3B2 8
ASAP1 8
ATXN7 8
CAST 8
CD46 8
CEP170 8
CEP85L 8
CKAP5 8
CKB 8
CNNM2 8
CORO6 8
DCP1A 8
ENO1 8
EP300 8
ERAP1 8
EVI5 8
FBXL20 8
FLOT1 8
FYN 8
GPHN 8
IMMP2L 8
KLC1 8
MADD 8
MAN2A1 8
MPHOSPH98
MPP6 8
ATP6V1D6
CCDC25 6
FBXL16 6
NFATC3 6
PRMT7 6
RANGAP16
CNOT1 5
DDX27 5
NCR3LG15
PDCD11 5
RAB40C 5
JKAMP 4
SPTLC1 4
FBXL4 3
GLT8D1 3
IK 3
NSRP1 3
PARK7 3
TAPBP 3
ARNT 2
BAG4 2
CDK10 2
FADS3 2
MANBA 2
BOLA3 1
CDIP1 1
DDHD2 1
NEK4 1
SNX19 1
YJEFN3 1
In [ ]: